library(tidyverse)
library(here)
library(hrbrthemes)
library(janitor)
library(corrplot)
# Reproducibility: select the "Rounding" sampling method, then fix the seed.
RNGkind(sample.kind = "Rounding")
set.seed(1)

# Use the hrbrthemes ipsum theme for every ggplot drawn from here on.
theme_set(theme_ipsum())

# Read data/creditcard.csv (path resolved by here()) into a tibble.
credit <- here("data", "creditcard.csv") %>%
  read_csv() %>%
  as_tibble()

# Show the first rows, then check whether any value is missing.
credit %>% head()
credit %>% anyNA()
[1] FALSE
# Print a summary of every column of `credit`, one column at a time.
# seq_len() replaces 1:ncol(): with zero columns, 1:0 would run the body for
# i = 1 and i = 0, whereas seq_len(0) skips the loop entirely.
for (i in seq_len(ncol(credit))) {
  # credit[, i] stays a one-column tibble, so each summary table is printed
  # with the column name as its header.
  print(summary(credit[, i]))
}
Time
Min. : 0
1st Qu.: 54202
Median : 84692
Mean : 94814
3rd Qu.:139320
Max. :172792
V1
Min. :-56.40751
1st Qu.: -0.92037
Median : 0.01811
Mean : 0.00000
3rd Qu.: 1.31564
Max. : 2.45493
V2
Min. :-72.71573
1st Qu.: -0.59855
Median : 0.06549
Mean : 0.00000
3rd Qu.: 0.80372
Max. : 22.05773
V3
Min. :-48.3256
1st Qu.: -0.8904
Median : 0.1799
Mean : 0.0000
3rd Qu.: 1.0272
Max. : 9.3826
V4
Min. :-5.68317
1st Qu.:-0.84864
Median :-0.01985
Mean : 0.00000
3rd Qu.: 0.74334
Max. :16.87534
V5
Min. :-113.74331
1st Qu.: -0.69160
Median : -0.05434
Mean : 0.00000
3rd Qu.: 0.61193
Max. : 34.80167
V6
Min. :-26.1605
1st Qu.: -0.7683
Median : -0.2742
Mean : 0.0000
3rd Qu.: 0.3986
Max. : 73.3016
V7
Min. :-43.5572
1st Qu.: -0.5541
Median : 0.0401
Mean : 0.0000
3rd Qu.: 0.5704
Max. :120.5895
V8
Min. :-73.21672
1st Qu.: -0.20863
Median : 0.02236
Mean : 0.00000
3rd Qu.: 0.32735
Max. : 20.00721
V9
Min. :-13.43407
1st Qu.: -0.64310
Median : -0.05143
Mean : 0.00000
3rd Qu.: 0.59714
Max. : 15.59500
V10
Min. :-24.58826
1st Qu.: -0.53543
Median : -0.09292
Mean : 0.00000
3rd Qu.: 0.45392
Max. : 23.74514
V11
Min. :-4.79747
1st Qu.:-0.76249
Median :-0.03276
Mean : 0.00000
3rd Qu.: 0.73959
Max. :12.01891
V12
Min. :-18.6837
1st Qu.: -0.4056
Median : 0.1400
Mean : 0.0000
3rd Qu.: 0.6182
Max. : 7.8484
V13
Min. :-5.79188
1st Qu.:-0.64854
Median :-0.01357
Mean : 0.00000
3rd Qu.: 0.66251
Max. : 7.12688
V14
Min. :-19.2143
1st Qu.: -0.4256
Median : 0.0506
Mean : 0.0000
3rd Qu.: 0.4931
Max. : 10.5268
V15
Min. :-4.49894
1st Qu.:-0.58288
Median : 0.04807
Mean : 0.00000
3rd Qu.: 0.64882
Max. : 8.87774
V16
Min. :-14.12985
1st Qu.: -0.46804
Median : 0.06641
Mean : 0.00000
3rd Qu.: 0.52330
Max. : 17.31511
V17
Min. :-25.16280
1st Qu.: -0.48375
Median : -0.06568
Mean : 0.00000
3rd Qu.: 0.39968
Max. : 9.25353
V18
Min. :-9.498746
1st Qu.:-0.498850
Median :-0.003636
Mean : 0.000000
3rd Qu.: 0.500807
Max. : 5.041069
V19
Min. :-7.213527
1st Qu.:-0.456299
Median : 0.003735
Mean : 0.000000
3rd Qu.: 0.458949
Max. : 5.591971
V20
Min. :-54.49772
1st Qu.: -0.21172
Median : -0.06248
Mean : 0.00000
3rd Qu.: 0.13304
Max. : 39.42090
V21
Min. :-34.83038
1st Qu.: -0.22839
Median : -0.02945
Mean : 0.00000
3rd Qu.: 0.18638
Max. : 27.20284
V22
Min. :-10.933144
1st Qu.: -0.542350
Median : 0.006782
Mean : 0.000000
3rd Qu.: 0.528554
Max. : 10.503090
V23
Min. :-44.80774
1st Qu.: -0.16185
Median : -0.01119
Mean : 0.00000
3rd Qu.: 0.14764
Max. : 22.52841
V24
Min. :-2.83663
1st Qu.:-0.35459
Median : 0.04098
Mean : 0.00000
3rd Qu.: 0.43953
Max. : 4.58455
V25
Min. :-10.29540
1st Qu.: -0.31715
Median : 0.01659
Mean : 0.00000
3rd Qu.: 0.35072
Max. : 7.51959
V26
Min. :-2.60455
1st Qu.:-0.32698
Median :-0.05214
Mean : 0.00000
3rd Qu.: 0.24095
Max. : 3.51735
V27
Min. :-22.565679
1st Qu.: -0.070840
Median : 0.001342
Mean : 0.000000
3rd Qu.: 0.091045
Max. : 31.612198
V28
Min. :-15.43008
1st Qu.: -0.05296
Median : 0.01124
Mean : 0.00000
3rd Qu.: 0.07828
Max. : 33.84781
Amount
Min. : 0.00
1st Qu.: 5.60
Median : 22.00
Mean : 88.35
3rd Qu.: 77.17
Max. :25691.16
Class
Min. :0.000000
1st Qu.:0.000000
Median :0.000000
Mean :0.001728
3rd Qu.:0.000000
Max. :1.000000
# Draw one density plot per column of `credit`, excluding the Class column.
# Class is excluded by name rather than by position (31) so the loop does not
# depend on the column order.
for (i in setdiff(names(credit), "Class")) {
  # aes_string() is deprecated in ggplot2; .data[[i]] looks the column up by
  # its name instead. labs(x = i) pins the axis title to the column name.
  p <- ggplot(credit, aes(x = .data[[i]])) +
    geom_density(fill = "cornsilk") +
    labs(x = i)
  print(p)
}
# Histogram of Time, one free-scaled panel per Class.
# Class is a numeric 0/1 column, so it is wrapped in factor() for the fill
# mapping; mapped as a number it would get a continuous colour scale.
# bins = 30 is the geom_histogram() default, written out explicitly.
ggplot(data = credit, aes(x = Time, fill = factor(Class))) +
  geom_histogram(bins = 30) +
  facet_wrap(~Class, scales = "free") +
  labs(fill = "Class")
# Histogram of log(Amount), one free-scaled panel per Class.
# Amount has a minimum of 0, and log(0) is -Inf; ggplot2 drops those
# non-finite values from the histogram with a warning.
# Class is a numeric 0/1 column, so it is wrapped in factor() for the fill
# mapping; mapped as a number it would get a continuous colour scale.
# bins = 30 is the geom_histogram() default, written out explicitly.
ggplot(data = credit, aes(x = log(Amount), fill = factor(Class))) +
  geom_histogram(bins = 30) +
  facet_wrap(~Class, scales = "free") +
  labs(fill = "Class")
# Scatter plot of log(Amount) against Time, one free-scaled panel per Class.
# alpha is a fixed setting, so it belongs in geom_point() rather than aes():
# inside aes() the constant 0.2 is treated as a mapped value, which adds an
# "alpha" legend and does not draw the points at 20% opacity.
ggplot(data = credit, aes(x = Time, y = log(Amount))) +
  geom_point(alpha = 0.2) +
  facet_wrap(~Class, scales = "free")
The Amount variable is clearly right-skewed, so it is worth applying a log transformation to it. Because some transactions have an Amount of 0, and log(0) is -Inf, we add 1 to Amount before taking the log so that every transformed value is finite.
# Add log_amount = log(1 + Amount). log1p() is the base R function for this
# transformation and stays accurate when Amount is close to 0; an Amount of 0
# maps to 0 rather than -Inf.
credit <- credit %>%
  mutate(log_amount = log1p(Amount))

# Check the range of the transformed values.
summary(credit$log_amount)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 1.887 3.135 3.152 4.359 10.154
# Density of the log-transformed amount.
credit %>%
  ggplot(mapping = aes(x = log_amount)) +
  geom_density(fill = "cornsilk")

# Correlation matrix of every column except the 31st, drawn as the upper
# triangle with square glyphs.
corrplot(
  corr = cor(credit[-31]),
  type = "upper",
  method = "square"
)
Transforming Amount has helped to reduce collinearity among the predictors. The only notable correlations are between V3 and Time, and between V2 and log_amount. This is as expected: PCA produces orthogonal linear combinations, so there should not be much correlation between the resulting components.